import pandas as pd
import math
import time
import re
import json
import ast
import numpy as np

# Load the input table. Later code in this script reads its
# `chatgpt_response` and `gpt4_response` columns.
data = pd.read_csv(
    "Data/gpt4_bills_batch100_originallabels_seed2900.csv"
)

def extract_labels(labels: str) -> list:
    """Parse a newline-separated, numbered label listing into a list of labels.

    Each non-blank line is expected to look like ``"<number>: <label>"``.
    Everything up to and including the first colon is dropped and the rest is
    returned with surrounding whitespace removed.

    Args:
        labels: The raw multi-line response text.

    Returns:
        One label string per non-blank line, in input order.
    """
    extracted_labels = []
    for raw_line in labels.split('\n'):
        # strip() also removes a trailing '\r' from Windows line endings,
        # which would otherwise make the label miss its dictionary lookup.
        line = raw_line.strip()
        if not line:
            continue
        # Split on the FIRST colon only, so a label that itself contains a
        # colon is kept whole instead of being truncated.
        _, separator, label = line.partition(':')
        # A line without any colon is kept as-is rather than raising
        # IndexError; this keeps one output entry per input line.
        extracted_labels.append(label.strip() if separator else line)
    return extracted_labels


# Lookup table from a topic label's text to its numeric code.
# The codes are not contiguous: 11 and 22 are not assigned here, and the
# catch-all "Other" uses 99.
# NOTE(review): the numbering looks like the Comparative Agendas Project
# major-topic scheme -- confirm against the project's codebook.
label_to_number = {
    "Macroeconomics": 1,
    "Civil Rights": 2,
    "Health": 3,
    "Agriculture": 4,
    "Labor": 5,
    "Education": 6,
    "Environment": 7,
    "Energy": 8,
    "Immigration": 9,
    "Transportation": 10,
    "Law and Crime": 12,
    "Social Welfare": 13,
    "Housing": 14,
    "Domestic Commerce": 15,
    "Defense": 16,
    "Technology": 17,
    "Foreign Trade": 18,
    "International Affairs": 19,
    "Government Operations": 20,
    "Public Lands": 21,
    "Culture": 23,
    "Other": 99,
}

# Convert the GPT text labels into numeric codes and write the result to CSV.

# Number of rows covered by one GPT-4 response; the response text is read
# from the LAST row of each batch.
BATCH_SIZE = 100
# Code assigned to any label that is not a key of label_to_number.
UNKNOWN_LABEL_CODE = 199

# GPT3.5 text labels to numerical labels (one label per row).
# Values missing from the mapping (including NaN) come back from map() as NaN
# and are then replaced by the unknown code. The column is kept as float64 so
# the written CSV has the same number format as gpt4_num_label.
data["gpt35_num_label"] = (
    data["chatgpt_response"]
    .map(label_to_number)
    .fillna(UNKNOWN_LABEL_CODE)
    .astype("float64")
)

# GPT4 label parsing: visit only the last row of every complete batch.
# NOTE(review): rows after the last complete batch receive no GPT-4 labels;
# confirm the input never ends with a partial batch.
for batch_end in range(BATCH_SIZE - 1, len(data), BATCH_SIZE):
    batch_start = batch_end - (BATCH_SIZE - 1)
    response = data.loc[batch_end, 'gpt4_response']

    # Only parse when the response cell is not empty; a missing value is a
    # float NaN and cannot be split into lines.
    if pd.isna(response):
        print(f"Rows {batch_start}-{batch_end}: empty gpt4_response, batch skipped")
        continue

    print(response)
    labels = extract_labels(response)
    print(labels)

    # Fail with the batch position instead of pandas' generic length-mismatch
    # error when the response does not contain one label per row.
    if len(labels) != BATCH_SIZE:
        raise ValueError(
            f"Rows {batch_start}-{batch_end}: expected {BATCH_SIZE} labels "
            f"in gpt4_response, got {len(labels)}"
        )

    # .loc slices are label-based and inclusive on both ends, so this covers
    # exactly BATCH_SIZE rows.
    data.loc[batch_start:batch_end, 'gpt4_labels'] = labels
    # Map gpt4 label predictions to label numbers
    data.loc[batch_start:batch_end, 'gpt4_num_label'] = [
        label_to_number.get(label, UNKNOWN_LABEL_CODE) for label in labels
    ]

data.to_csv("Data/gpt_classifications.csv", index=False)

